Scott Hendrickson (2014-03-20)
Classification:
In [2]:
import numpy as np
import pandas as pd
from ggplot import *
%matplotlib inline
In [13]:
dat_1d = {"x" :[2, 2, 1, 0.25, -0.4, -1, -2, -2, 1],
"label":[0, 0, 0, 0, 1, 1, 1, 1, 1]}
df_1d = pd.DataFrame.from_dict(data = dat_1d)
df_1d
Out[13]:
In [14]:
df_1d["label_fact"] = pd.Categorical.from_array([str(x) for x in df_1d.label])
ggplot(aes(x="x",y="label", color="label_fact"), data=df_1d) + geom_point()
Out[14]:
Properties of the sigmoid as a function of the signed distance $z$ from the boundary:
$f(z) = \frac{1}{1 + \exp(-z)}$
It maps any real $z$ to $(0, 1)$, with $f(0) = \tfrac{1}{2}$ and $f(-z) = 1 - f(z)$.
In [15]:
def sigmoid(x):
    # vector in, vector out
    return 1./(1. + np.exp(-x))

# Let's see it
d_sig = pd.DataFrame.from_dict({"x": np.linspace(-8, 8, 30),
                                "y": sigmoid(np.linspace(-8, 8, 30))})
ggplot(aes(x="x", y="y"), data = d_sig) + geom_line(color="orange")
Out[15]:
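A quick numerical check of these properties, using the sigmoid defined above (a minimal sketch, not part of the original notebook):
In [ ]:
# Sanity checks for the sigmoid
z = np.linspace(-8, 8, 30)
s = sigmoid(z)
assert abs(sigmoid(0.0) - 0.5) < 1e-12     # f(0) = 1/2
assert np.allclose(s + sigmoid(-z), 1.0)   # symmetry: f(-z) = 1 - f(z)
assert np.all((s > 0) & (s < 1))           # output confined to (0, 1)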
The model predicts the label $l(x) \in \{0, 1\}$ from the sigmoid output $y(x) \in (0, 1)$. The argument $z$ to the sigmoid is calculated from the input data, e.g.:
For 1-D input vectors:
$z = \beta_0 + \beta_1 x$
Or 2-D input vectors:
$z = \beta_0 + \beta_1 x_1 + \beta_2 x_2$
$\ldots$
Model prediction in 1-D:
$y(x) = \text{sigmoid}(\beta_0 + \beta_1 x) = \frac{1}{1 + \exp(-(\beta_0 + \beta_1 x))}$
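For illustration, with hypothetical (not fitted) coefficients $\beta_0 = 0$ and $\beta_1 = -2$, the prediction moves from near 1 to near 0 as $x$ crosses the boundary:
In [ ]:
# Hypothetical coefficients, for illustration only
beta = [0.0, -2.0]
for x in [-2.0, 0.0, 2.0]:
    print("x = {:+.1f} -> y(x) = {:.3f}".format(x, sigmoid(beta[0] + beta[1]*x)))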
We need a cost function whose optimization yields the boundary hyperplane defined by $[\beta_0, \beta_1, \ldots]$. A natural first try is the squared error:
$\text{Cost} = \frac{1}{m} \sum \frac{1}{2}\big(l(x) - y(x)\big)^2$
This is the cost if the model prediction is $y(x)$ and the actual outcome is $l(x)$. With the sigmoid, this function is not convex, so local minima are a challenge.
Why is this a problem?
Many local minima mean gradient descent may not find the global optimum. We would like a convex cost function so that gradient descent converges to the global minimum.
For model $y(x)$, use the cross-entropy cost instead:
$C = -[1-l(x)]\ln[1-y(x)] - l(x) \ln[y(x)]$
$C = -[\text{cost for label 0}] - [\text{cost for label 1}]$
In [17]:
def cost(label_arr, model_arr):
    # inputs are np.array([])
    # average over the training set
    c = -(1. - label_arr)*np.log(1. - model_arr) - label_arr*np.log(model_arr)
    return np.sum(c)/len(c)

print("Model ~ Label: {:.4}".format(cost(np.array([0,0,0,1,1,1]), np.array([0.1,0.2,0.05,0.99,0.98,0.99]))))
print("Model !~ Label: {:.4}".format(cost(np.array([0,0,0,1,1,1]), np.array([0.99,0.98,0.99,0.1,0.2,0.05]))))
In [18]:
def model_1d(model_vec, df):
    # Calculate model output
    z = model_vec[0] + model_vec[1] * df.x
    return sigmoid(z)

def model_cost(model_vec, df, model):
    # model_vec is a python vector of coefficients
    # df is a data frame with columns x and label
    # model is a function to calculate model output
    return cost(df.label, model(model_vec, df))
In [19]:
from scipy.optimize import minimize as mini
res_1d = mini(model_cost, x0=[0.1,0.1], args=(df_1d, model_1d))
print(res_1d)
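In 1-D the fitted boundary is the single point where $z = 0$, i.e. $x = -\beta_0/\beta_1$. A quick check (not in the original notebook):
In [ ]:
# Point where the model output crosses 1/2
print("decision boundary at x = {:.3f}".format(-res_1d.x[0]/res_1d.x[1]))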
In [20]:
df_pred_1d = pd.DataFrame.from_dict({"x": np.linspace(-3,3,99)})
df_pred_1d["y"] = model_1d(res_1d.x, df_pred_1d)
df_pred_1d.head()
Out[20]:
In [21]:
ggplot(aes(x="x",y="label", color="label_fact"), data=df_1d) + \
geom_point() + \
geom_line(aes(x="x", y="y"), color="green", data=df_pred_1d)
Out[21]:
In [22]:
dat_2d = { "x1":[-1.0, 0.3, 2.05, 0.95, 2,-1.1,-2.4,-0.9,-2.1],
"x2":[ 2.0, 0.6, 2.1, 2.1, -1,-1 ,-1.8, -2, 0.0],
"label":[0,0,0,0,0,1,1,1,1]}
df_2d = pd.DataFrame.from_dict(data = dat_2d)
df_2d["label_fact"] = pd.Categorical.from_array([str(x) for x in df_2d.label])
df_2d
Out[22]:
In [23]:
ggplot(aes(x="x1",y="x2", color="label_fact"), data=df_2d) + geom_point()
Out[23]:
In [24]:
def model_2d(model_vec, df):
    z = model_vec[0] + model_vec[1] * df.x1 + model_vec[2] * df.x2
    return sigmoid(z)
In [25]:
res_2d = mini(model_cost, x0=[1,1,0.5], args=(df_2d, model_2d))
print(res_2d)
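The decision boundary is where the model is indifferent, $y = \tfrac{1}{2}$, i.e. where $z = 0$. Solving $\beta_0 + \beta_1 x_1 + \beta_2 x_2 = 0$ for $x_2$ gives the line plotted below:
$x_2 = -\frac{\beta_0 + \beta_1 x_1}{\beta_2}$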
In [27]:
# z = 0 defines a line
x1 = np.linspace(-3,3,27)
x2 = -(res_2d.x[0] + x1*res_2d.x[1])/res_2d.x[2]
df_pred_2d = pd.DataFrame.from_dict({"x1": x1,
                                     "x2": x2})
df_pred_2d["y"] = model_2d(res_2d.x, df_pred_2d)
df_pred_2d.head()
Out[27]:
In [29]:
ggplot(aes(x="x1", y="x2"),data=df_pred_2d) + geom_line() + geom_point(aes(x="x1",y="x2", color="label_fact"), data=df_2d)
Out[29]:
In [31]:
from scipy.stats import norm
# Make some data
npts = 65
x1 = np.concatenate((norm.rvs(loc=2, scale=1.5, size=npts), norm.rvs(loc=-2, scale=1.5, size=npts)))
x2 = np.concatenate((norm.rvs(loc=2, scale=1.5, size=npts), norm.rvs(loc=-1, scale=1.5, size=npts)))
lab = [0]*npts + [1]*npts
# Plots and fits
dat_gaus = { "x1": x1,
"x2": x2 ,
"label": lab }
df_2d_gaus = pd.DataFrame.from_dict(data = dat_gaus)
df_2d_gaus["label_fact"] = pd.Categorical.from_array([str(x) for x in df_2d_gaus.label])
# Fit
res_2d_gaus = mini(model_cost, x0=[0.1,1.2,0.5], args=(df_2d_gaus, model_2d))
# z = 0 defines a line
x1 = np.linspace(-4, 4, 27)
x2 = -(res_2d_gaus.x[0] + x1*res_2d_gaus.x[1])/res_2d_gaus.x[2]
print(res_2d_gaus)
df_2d_gaus_pred = pd.DataFrame.from_dict({"x1": x1, "x2": x2})
df_2d_gaus_pred["y"] = model_2d(res_2d_gaus.x, df_2d_gaus)
ggplot(aes(x="x1", y="x2"), data=df_2d_gaus_pred) + \
geom_line(color="red") + \
geom_point(aes(x="x1",y="x2", color="label_fact"), data=df_2d_gaus)
Out[31]:
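With overlapping Gaussian classes the line no longer separates the labels perfectly. One simple way to quantify the fit (a sketch, not part of the original notebook) is training accuracy, thresholding the model output at $\tfrac{1}{2}$:
In [ ]:
# Fraction of training points on the correct side of the boundary
pred = (model_2d(res_2d_gaus.x, df_2d_gaus) > 0.5).astype(int)
print("training accuracy: {:.3f}".format(np.mean(pred == df_2d_gaus.label)))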
In [34]:
def model_quad(model_vec, df):
    # quadratic in x1, linear in x2
    z = model_vec[0] + model_vec[1]*df.x2 + model_vec[2]*df.x1*df.x1 + model_vec[3]*df.x1
    return sigmoid(z)
# Make some data
npts = 100
x1 = np.concatenate((norm.rvs(loc=-3, scale=1, size=npts),
                     norm.rvs(loc=0, scale=1, size=npts),
                     norm.rvs(loc=0, scale=1, size=npts),
                     norm.rvs(loc=3, scale=1, size=npts)))
x2 = np.concatenate((norm.rvs(loc=4, scale=6, size=npts),
                     norm.rvs(loc=-5, scale=2, size=npts),
                     norm.rvs(loc=6, scale=2, size=npts),
                     norm.rvs(loc=4, scale=6, size=npts)))
lab = [0]*npts + [1]*npts + [0]*npts + [0]*npts
# Plots and fits
dat_quad = { "x1": x1,
"x2": x2,
"label": lab }
df_quad = pd.DataFrame.from_dict(data = dat_quad)
df_quad["label_fact"] = pd.Categorical.from_array([str(x) for x in df_quad.label])
# Fit
res_quad = mini(model_cost, x0=[0.5, 1.2, -1, 2], args=(df_quad, model_quad))
print(res_quad)
# z = 0 now defines a parabola in the (x1, x2) plane
x1 = np.linspace(-3,3,21)
x2 = -(res_quad.x[0] + res_quad.x[3]*x1 + res_quad.x[2]*x1*x1)/res_quad.x[1]
df_quad_pred = pd.DataFrame.from_dict({"x1": x1, "x2": x2})
df_quad_pred["y"] = model_2d(res_quad.x, df_quad)
ggplot(aes(x="x1", y="x2"), data=df_quad_pred) + \
geom_line(color="red") + \
geom_point(aes(x="x1",y="x2", color="label_fact"), data=df_quad)
Out[34]:
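Note that the quadratic model is still linear in the parameters: it is ordinary logistic regression applied to the engineered features $[1, x_2, x_1^2, x_1]$. The sketch below makes the feature map explicit; the helper `model_features` is hypothetical, not part of the original notebook.
In [ ]:
# Hypothetical generalization: logistic regression over an arbitrary feature map
def model_features(model_vec, df, features):
    # features: list of functions, each mapping the data frame to one column
    z = sum(b*f(df) for b, f in zip(model_vec, features))
    return sigmoid(z)

quad_features = [lambda df: 1.0, lambda df: df.x2,
                 lambda df: df.x1**2, lambda df: df.x1]
res_feat = mini(model_cost, x0=[0.5, 1.2, -1, 2],
                args=(df_quad, lambda v, df: model_features(v, df, quad_features)))
print(res_feat.x)  # should reproduce res_quad.x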
Follow-up ideas that are important to success with machine learning: